Pandas 入門

「Pythonを使ったデータ解析入門 3idea | OpenBook」を見ながら、Pandasの基本的な操作を写経して慣れるためのノート。

Pandas 基本操作

https://openbook4.me/projects/183/sections/777



In [1]:

    
# numpy と pandas を import する。np, pd と書くのは慣習っぽい
import numpy as np
import pandas as pd

Series

ドキュメント: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
ソース: https://github.com/pandas-dev/pandas/blob/master/pandas/core/series.py



In [13]:

    
# Series: a one-dimensional array whose axis carries labels.
# Without an explicit index the labels are 0, 1, 2, ...
print(pd.Series([1,2,4]))
# Supply both the values and their index labels.
s = pd.Series([1,2,4], index=['a','b','c'])
print(s)
print(s.index)









    



0    1
1    2
2    4
dtype: int64
a    1
b    2
c    4
dtype: int64
Index(['a', 'b', 'c'], dtype='object')



In [14]:

    
# Summary statistics of the Series `s`
# (labels, in order: max, min, mean, median, variance, sum).
print('最大値:', s.max())
print('最小値:', s.min())
print('平均値:', s.mean())
print('中央値:', s.median())
print('分散:', s.var()) # variance (the 2.33 printed for [1, 2, 4] is the n-1 sample variance)
print('合計値:', s.sum())









    



最大値: 4
最小値: 1
平均値: 2.33333333333
中央値: 2.0
分散: 2.33333333333
合計値: 7



In [20]:

    
# Element-wise transformations of the Series `s`.
print('剰余:')
print(s.mod(2)) # modulo (remainder)

print('\n累積:')
print(s.cumsum()) # cumulative sum

print('\n関数を適用:')
print(s.apply(lambda x: x*3))   # apply a function to every value

print('\n値を変換:')
print(s.map({1: 10, 2: 200}))   # replace each value via the mapping; values without a key become NaN









    



剰余:
a    1
b    0
c    0
dtype: int64

累積:
a    1
b    3
c    7
dtype: int64

関数を適用:
a     3
b     6
c    12
dtype: int64

値を変換:
a     10.0
b    200.0
c      NaN
dtype: float64



In [23]:

    
# idxmax / idxmin return the index *label* of the largest / smallest value.
# (In current pandas, argmax / argmin return the integer position instead,
# so they would print 2 and 0 here rather than the labels c and a.)
print('最大値のindex:', s.idxmax())
print('最小値のindex:', s.idxmin())
# Conversions of the Series to plain Python / JSON representations.
print('listに変換:', s.tolist())
print('dictに変換:', s.to_dict())
print('jsonに変換:', s.to_json())









    



最大値のindex: c
最小値のindex: a
listに変換: [1, 2, 4]
dictに変換: {'c': 4, 'b': 2, 'a': 1}
jsonに変換: {"a":1,"b":2,"c":4}

DataFrame

ドキュメント: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
ソース: https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py



In [25]:

    
# Create a DataFrame from a list of rows, with explicit row labels (index)
# and column labels; the bare `df` on the last line displays it.
df = pd.DataFrame([[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                   index = ['i1', 'i2', 'i3'],
                   columns = list("abc"))
df



In [30]:

    
# Select data by row / column.
# `.ix` has been removed from pandas: use `.loc` for labels and `.iloc`
# for integer positions.

# Select one row by its index label.
print('i1行目のデータ:')
print(df.loc['i1'])

# Select one row by its integer position (0-based, so 1 is the row labelled i2).
print('\n1行目のデータ:')
print(df.iloc[1])

# A row label and a column label together give a single value.
print('\ni1行目のa列のデータ:')
print(df.loc['i1', 'a'])

# ":" selects everything along that axis.
print('\na列のデータ:')
print(df.loc[:, 'a'])

# Lists select several rows / columns at once.
# Rows by position combined with columns by label: translate the positions
# into labels first, because .loc only accepts labels.
print(df.loc[df.index[[1, 2]], ['b','c']])
print(df.iloc[[1, 2], [1, 2]])









    



i1行目のデータ:
a    1
b    4
c    7
Name: i1, dtype: int64

1行目のデータ:
a    2
b    5
c    8
Name: i2, dtype: int64

i1行目のa列のデータ:
1

a列のデータ:
i1    1
i2    2
i3    3
Name: a, dtype: int64
    b  c
i2  5  8
i3  6  9
    b  c
i2  5  8
i3  6  9



In [32]:

    
# Selecting a column
print('a列の全データ:')
print(df['a'])

# .values exposes the column's data as an array
print ('\na列の全データをarrayで取得')
print(df['a'].values)

# Chained lookup: pick column a, then the entry labelled i3
print ('\na列とindex(i3)を指定して値を取得')
print(df['a']['i3'])









    



a列の全データ:
i1    1
i2    2
i3    3
Name: a, dtype: int64

a列の全データをarrayで取得
[1 2 3]

a列とindex(i3)を指定して値を取得
3



In [39]:

    
# iloc (integer-location) selects purely by position
print(df.iloc[0]) # row 0
print(df.iloc[0, 1]) # row 0, column 1
print(df.iloc[:,0]) # column 0









    



a    1
b    4
c    7
Name: i1, dtype: int64
4
i1    1
i2    2
i3    3
Name: a, dtype: int64



In [8]:

    
# Boolean-mask selection and assignment
# (the frame is filled with random integers and no seed is set,
# so the printed values differ from run to run)
df = pd.DataFrame(np.random.randint(3, size=(5,3)))
print(df)

# Keep only the elements that satisfy the condition; the rest become NaN
print(df[df>1])
print(df[df==1])

# Assign a value to every element that satisfies the condition
df[df>1] = -1
print(df)









    



   0  1  2
0  1  0  2
1  1  2  1
2  0  0  2
3  0  0  2
4  2  0  2
     0    1    2
0  NaN  NaN  2.0
1  NaN  2.0  NaN
2  NaN  NaN  2.0
3  NaN  NaN  2.0
4  2.0  NaN  2.0
     0   1    2
0  1.0 NaN  NaN
1  1.0 NaN  1.0
2  NaN NaN  NaN
3  NaN NaN  NaN
4  NaN NaN  NaN
   0  1  2
0  1  0 -1
1  1 -1  1
2  0  0 -1
3  0  0 -1
4 -1  0 -1



In [15]:

    
# Dropping NaN (dropna)
df = pd.DataFrame([[1, 2, None], [3, None, 4], [5, 6, 7]])
print(df)
# Columns 1 and 2 print as floats because None is stored as NaN, which is a
# float; column 0 has no missing value and stays integer.
print(df.dropna()) # drop rows that contain NaN
print(df.dropna(axis=1)) # drop columns that contain NaN
print(df.dropna(subset=[1])) # only the listed column(s) are checked for NaN









    



   0    1    2
0  1  2.0  NaN
1  3  NaN  4.0
2  5  6.0  7.0
   0    1    2
2  5  6.0  7.0
   0
0  1
1  3
2  5
   0    1    2
0  1  2.0  NaN
2  5  6.0  7.0



In [18]:

    
# Filling NaN (fillna / ffill / bfill)
# fillna(method='pad' / 'bfill') is deprecated in recent pandas;
# ffill() and bfill() are the direct replacements and give the same result.
print(df)
print(df.fillna(-1)) # fill with a fixed value
print(df.ffill()) # fill with the previous row's value (a NaN in the first row stays NaN)
print(df.bfill()) # fill with the next row's value









    



   0    1    2
0  1  2.0  NaN
1  3  NaN  4.0
2  5  6.0  7.0
   0    1    2
0  1  2.0 -1.0
1  3 -1.0  4.0
2  5  6.0  7.0
   0    1    2
0  1  2.0  NaN
1  3  2.0  4.0
2  5  6.0  7.0
   0    1    2
0  1  2.0  4.0
1  3  6.0  4.0
2  5  6.0  7.0



In [19]:

    
# Fill missing values by linear interpolation between the neighbouring values
# (a NaN with no earlier value, as at the top of column 2, is left as NaN)
print(df)
print(df.apply(pd.Series.interpolate))









    



   0    1    2
0  1  2.0  NaN
1  3  NaN  4.0
2  5  6.0  7.0
   0    1    2
0  1  2.0  NaN
1  3  4.0  4.0
2  5  6.0  7.0



In [32]:

    
# Handling duplicated values (duplicated)
df = pd.DataFrame([['a', 1], ['a', 1], ['a', 2], ['b', 3], ['b', 4]])
print(df)

print(df.duplicated()) # flag rows that repeat an earlier row
print(df.duplicated(0)) # flag rows whose column-0 value has already appeared
print(df.duplicated(1)) # same check, on column 1









    



   0  1
0  a  1
1  a  1
2  a  2
3  b  3
4  b  4
0    False
1     True
2    False
3    False
4    False
dtype: bool
0    False
1     True
2     True
3    False
4     True
dtype: bool
0    False
1     True
2    False
3    False
4    False
dtype: bool



In [38]:

    
# Removing duplicated rows
print(df.drop_duplicates())
print(df.drop_duplicates(0)) # drop rows whose column-0 value is a repeat
print(df.drop_duplicates(0, keep='last')) # keep the last occurrence of each duplicate instead

第3章 Pandas: DataFrameの変形

https://openbook4.me/projects/183/sections/1369



In [68]:

    
# 3x3 frame holding the integers 0..8, with columns a, b, c
df = pd.DataFrame(np.reshape(np.arange(9), (3, 3)),
                  columns=['a', 'b', 'c'])
df



In [47]:

    
# Take part of a DataFrame
print(df.head(2)) # first 2 rows
print(df.tail(2)) # last 2 rows



In [48]:

    
print(df.index) # the index (row labels)
print(df.columns) # the column labels









    



RangeIndex(start=0, stop=3, step=1)
Index(['a', 'b', 'c'], dtype='object')



In [49]:

    
# Change the layout of a DataFrame
print(df.T) # swap index and columns (transpose)



In [50]:

    
df.sort_index(axis=1, ascending=False) # sort the column labels in descending order



In [51]:

    
df.sort_values(by='b', ascending=False) # sort rows by the values in column b, descending



In [52]:

    
df.sort_values(by='b') # sort rows by the values in column b



In [53]:

    
# Masking: pick the rows of df that satisfy a condition
print(df.a)
print(df.a > 2)
df[df.a > 2] # only the rows where column a is greater than 2









    



0    0
1    3
2    6
Name: a, dtype: int64
0    False
1     True
2     True
Name: a, dtype: bool






    Out[53]:






  
    
      
      a
      b
      c
    
  
  
    
      1
      3
      4
      5
    
    
      2
      6
      7
      8



In [54]:

    
df[df > 3] # keep only the values greater than 3



In [57]:

    
# Work on a copy so that df itself is not modified
new_df = df.copy()
new_df[new_df > 3] = new_df * 2 # double only the values greater than 3
new_df



In [70]:

    
# Add a column (one value per row) to a copy of df
new_df = df.copy()
new_df['e'] = ['one','two','three']
new_df



In [72]:

    
# Removing NaN data
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
# Build the example frame first: the mask keeps only the values greater than 3
df_with_nan = df[df > 3]
df_with_nan



In [73]:

    
df_with_nan.dropna() # drop the rows that contain NaN



In [74]:

    
df_with_nan.dropna(how='all') # drop only the rows in which every value is NaN



In [75]:

    
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html
df_with_nan.fillna('Ice') # replace NaN with another value



In [76]:

    
# Joining DataFrames together
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html#pandas.concat
# Second frame to join with df: 3x2, values 0..5, columns e and f
df2 = pd.DataFrame(np.reshape(np.arange(6), (3,2)),
                   columns=['e', 'f'])
df2



In [78]:

    
# Join side by side (along the column axis)
new_df = pd.concat([df, df2], axis=1)
new_df



In [81]:

    
# Join vertically (along the row axis); df and df2 share no column names
new_df = pd.concat([df, df2], axis=0)
new_df



In [82]:

    
# Third frame: 3x2, values 0..5, using column names a and b that df also has
df3 = pd.DataFrame(np.reshape(np.arange(6), (3,2)),
                   columns=['a', 'b'])
df3



In [83]:

    
# Stack df3 under df; df3 has columns a and b only (no column c)
new_df = pd.concat([df, df3], axis=0)
new_df



In [85]:

    
# Append rows to a DataFrame
# DataFrame.append was removed in pandas 2.0; pd.concat (row-wise by default)
# is the replacement and returns a new frame in the same way.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html#pandas.concat
new_df = pd.concat([df, df3])
new_df



In [86]:



In [ ]:

	a	b	c	e	f
0	0.0	1.0	2.0	NaN	NaN
1	3.0	4.0	5.0	NaN	NaN
2	6.0	7.0	8.0	NaN	NaN
0	NaN	NaN	NaN	0.0	1.0
1	NaN	NaN	NaN	2.0	3.0
2	NaN	NaN	NaN	4.0	5.0

	a	b	c	e	f
0	0.0	1.0	2.0	NaN	NaN
1	3.0	4.0	5.0	NaN	NaN
2	6.0	7.0	8.0	NaN	NaN
0	NaN	NaN	NaN	0.0	1.0
1	NaN	NaN	NaN	2.0	3.0
2	NaN	NaN	NaN	4.0	5.0

	a	b	c	e	f
0	0.0	1.0	2.0	NaN	NaN
1	3.0	4.0	5.0	NaN	NaN
2	6.0	7.0	8.0	NaN	NaN
0	NaN	NaN	NaN	0.0	1.0
1	NaN	NaN	NaN	2.0	3.0
2	NaN	NaN	NaN	4.0	5.0